import re

def process_string(s):
    """Normalize a raw review string into lowercase, space-separated tokens.

    The transformations are applied in this order:

    1. Lowercase the text and replace each literal ``<br />`` with a space.
    2. Collapse a doubled double quote (``""``) into a single one.
    3. Surround every character that is not a word character, an
       apostrophe, or a space with spaces, so punctuation becomes its own
       token.
    4. Split off a leading run of non-word characters and a trailing
       apostrophe (apostrophes are skipped by step 3).
    5. Expand the contractions ``'s`` -> `` is``, ``'d`` -> `` would``,
       ``'ll`` -> `` will`` and ``n't`` -> `` not``.
    6. Collapse runs of spaces into one space.

    The result is not stripped, so it may start or end with a single space.

    Args:
        s: The raw text to normalize.

    Returns:
        The normalized string.
    """
    s = re.sub(r"<br />", r" ", s.lower())  # drop HTML line breaks
    s = re.sub(r'""', r'"', s)  # collapse doubled double quotes
    # Isolate every symbol except the apostrophe, which is needed intact for
    # the contraction handling below.
    s = re.sub(r"([^'\w ])", r" \1 ", s)
    s = re.sub(r"(^[^\w]+)", r" \1 ", s)  # split leading non-word characters
    s = re.sub(r"('$)", r" \1", s)  # split a trailing apostrophe
    # The trailing \b keeps these from firing inside a longer word: without
    # it "o'donnell" turns into "o wouldonnell" and "o'sullivan" into
    # "o isullivan".
    # NOTE: a possessive "'s" is still expanded to " is"; the two cannot be
    # told apart by a regex alone.
    s = re.sub(r"(?<=\w)'s\b", r" is", s)
    s = re.sub(r"(?<=\w)'d\b", r" would", s)
    s = re.sub(r"(?<=\w)'ll\b", r" will", s)
    s = re.sub(r"n't", r" not", s)
    s = re.sub(r" +", r" ", s)  # squeeze repeated spaces

    return s


def load_data(path, is_test=False, encoding=None):
    """Read a line-oriented review file into ``(review, label)`` tuples.

    Each non-blank line is stripped of surrounding whitespace. The review
    text is taken as ``line[1:-4]`` (first character and last four
    characters dropped) and passed through ``process_string``. Unless
    ``is_test`` is set, the label is the last character of the line parsed
    as an integer.

    Blank lines are skipped rather than treated as end-of-file, so a stray
    empty line no longer truncates the dataset. Lines whose last character
    is not a digit are skipped when labels are required.
    # NOTE(review): presumably that is how a header row gets dropped;
    # confirm against the data files.

    Errors raised while opening, reading or decoding the file propagate to
    the caller.

    Args:
        path: Path of the file to read.
        is_test: When true, no label is parsed and every example gets the
            placeholder label ``-1``.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        A list of ``(processed_review, label)`` tuples in file order.
    """
    lines = []
    with open(path, encoding=encoding) as fp:
        for raw_line in fp:
            line = raw_line.strip()
            if not line:
                # A blank line is not EOF; keep reading.
                continue
            if is_test:
                label = -1
            else:
                try:
                    label = int(line[-1])
                except ValueError:
                    # No numeric label at the end of the line: skip it.
                    continue
            lines.append((process_string(line[1:-4]), label))
    return lines

if __name__ == "__main__":
    # Manual smoke check: load the training file, report how many examples
    # were read, then show the review text and label of the second one.
    path = '../data/Train.csv'
    lines = load_data(path)
    print("# examples:", len(lines))
    review, label = lines[1]
    print(review)
    print(label)
    # To try the normalizer on a single sentence instead, call e.g.
    # process_string("And isn't that what true art is supposed to be all about?")